Megakaryocyte targeting via LNA pulldown

Megakaryocytes represent a small proportion of the cells in a typical PBMC sample and therefore were selected as an additional test of the LNA pulldown techinique. Four megakaryocyte cells were reamplified after LNA pulldown.

Following reamplification the 4 cell libraries were pooled and resequenced together. The raw fastqs were then processed using a Snakemake pipeline, to produce two processed data files:

  1. A matrix with UMIs per cell (column) per gene (rows) (dge_matrix.txt)
  2. A flatfile with per UMI information (umigroups.txt.gz)

This RMarkdown document will produce the following figures:

  1. …
  2. …
  3. …

Organize single cell data

First designate the libraries and the cells that were resampled.

cells <- list(
  mkcell_pulldown = c("TGCGCAGCAGGTCGTC",
                      "ACTTGTTAGGACCACA",
                      "CCATTCGTCCCTGACT",
                      "TGTCCCAGTAAACACA"))

libs <- c(
  "kirkpatrick",
  "mkcell_pulldown")

bc_metadat <- read_tsv(file.path(data_dir, 
                         "kirkpatrick", 
                         "fastq",
                         "original", 
                         "barcodes_from_10x_run.txt"),
                         col_names = c("cell_id", "barcode_10x"))

## original library to compare against
reflib <- "kirkpatrick"
## all resampled libs to plot
resampled_libs <- "mkcell_pulldown"
## reference resampled lib for resampled vs control plots
resampled_lib <- "mkcell_pulldown"

## pretty name for libraries
lib_names = c(
  kirkpatrick = "Original Library",
  mkcell_pulldown = "Resampled Library"
)

## pretty names for cells
cell_names = c(
  "TGCGCAGCAGGTCGTC-1" = "MK-cell-1",
  "ACTTGTTAGGACCACA-1" = "MK-cell-2",
  "CCATTCGTCCCTGACT-1" = "MK-cell-3",
  "TGTCCCAGTAAACACA-1" = "MK-cell-4")

Load and organize a table for each library of read counts per cell per gene, and a table of umi counts per cell per gene.

umis_to_genes <- function(umipath, cells_to_exclude = c("Cell_unmatched")){
  umis <- read_tsv(umipath,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
    filter(barcode_10x != cells_to_exclude)
  
  mol_fields <- str_count(umis$umi_molecule[1], "::")
  
  if(mol_fields == 2){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "genome", "gene"),
                     sep = "::") %>% 
      mutate(gene = str_c(genome, "::", gene))
  } else if (mol_fields == 1){
    umis <- separate(umis, umi_molecule, 
                     into = c("seq", "gene"),
                     sep = "::")
  } else {
    stop("separator :: missing from umi_molecule field")
  }
  
  reads <- select(umis, 
                  barcode_10x, 
                  gene,
                  count)
  
  reads <- group_by(reads, 
                   barcode_10x, gene) %>% 
    summarize(counts = sum(count))
  
  reads <- spread(reads, barcode_10x, counts, 
                  fill = 0L)
  
  reads
}

## read in umigroups flat file with read counts per umi per gene per cell
## expand out to a read count matrix
umipaths <- file.path(data_dir, 
                      libs, 
                      "umis",
                      "umigroups.txt.gz")
read_dat <- map(umipaths, 
                ~umis_to_genes(.))
names(read_dat) <- libs

## read in umi_tools count table with umi counts per gene per cell
umi_dat <- map(libs, 
                ~read_tsv(file.path(data_dir, 
                          .x,
                          "dgematrix",
                          "dge_matrix.txt")) %>% 
                 select(-Cell_unmatched))
names(umi_dat) <- libs

cell_obj_mdata <- map(cells, 
                      ~mutate(bc_metadat, 
                              resampled = ifelse(barcode_10x %in% .x,
                                                  TRUE,
                                                  FALSE)))

Next organize these tables into simple classes called resampled-sets to keep track of each experiment’s relavant raw, processed, and meta data.

#' simple class to hold info for each experiment
create_sc_obj <- function(umi_df,
                          read_df,
                          cell_mdata_df){
  x <- list()
  class(x) <- "resampled-set"
  x$umis <- umi_df
  x$reads <- read_df
  x$meta_dat <- cell_mdata_df
  return(x)
}

sc_objs <- list(umi_dat, read_dat, cell_obj_mdata)
sc_objs <- pmap(sc_objs, create_sc_obj)

rm(umi_dat)
rm(read_dat)

Next perform basic processing. 1) generate separate objects to store sparse matrices of umi and read counts. 2) normalize read and umi count data by total library size (sum of all read or umi counts for all cells in the experiment) and report as Reads per million or UMIs per million. 3) Compute per cell metrics (read and umi counts, sequencing saturation)

tidy_to_matrix <- function(df){
   df <- as.data.frame(df)
   rownames(df) <- df[, 1]
   df[, 1] <- NULL
   mat <- as.matrix(df)
   mat <- as(mat, "sparseMatrix")   
   return(mat)
}

#' keep both tidy and matrix objs
generate_matrices <- function(sc_obj){
  sc_obj$umi_matrix <- tidy_to_matrix(sc_obj$umis)
  sc_obj$read_matrix <- tidy_to_matrix(sc_obj$reads)
  sc_obj
}

#' normalize by library size (Reads per Million)
norm_libsize <- function(sc_obj){
  sc_obj$norm_umi <- 1e6 * sweep(sc_obj$umi_matrix, 2, 
                                 sum(as.vector(sc_obj$umi_matrix)), "/")
  sc_obj$norm_reads <- 1e6 * sweep(sc_obj$read_matrix, 2, 
                                   sum(as.vector(sc_obj$read_matrix)), "/")
  sc_obj
}

add_metadata <- function(sc_obj, dat){
  if (is.vector(dat)){
    new_colname <- deparse(substitute(dat))
    df <- data_frame(!!new_colname := dat)
    df[[new_colname]] <- dat
    df[["cell_id"]] <- names(dat)
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 df,
                                 by = "cell_id")
    
  } else if (is.data.frame(dat)) {
    sc_obj$meta_dat <- left_join(sc_obj$meta_dat,
                                 dat,
                                 by = "cell_id")
  }
  sc_obj
}

compute_summaries <- function(sc_obj){
  ## raw counts
  total_umis <- colSums(sc_obj$umi_matrix)
  names(total_umis) <- colnames(sc_obj$umi_matrix)
  total_reads <- colSums(sc_obj$read_matrix)
  names(total_reads) <- colnames(sc_obj$read_matrix)
  
  ## norm counts
  norm_total_umis <- colSums(sc_obj$norm_umi)
  names(norm_total_umis) <- colnames(sc_obj$norm_umi)
  norm_total_reads <- colSums(sc_obj$norm_reads)
  names(norm_total_reads) <- colnames(sc_obj$norm_reads)
    
  sc_obj <- add_metadata(sc_obj, total_umis)
  sc_obj <- add_metadata(sc_obj, total_reads)
  sc_obj <- add_metadata(sc_obj, norm_total_umis)
  sc_obj <- add_metadata(sc_obj, norm_total_reads)
  
  ## compute cDNA duplication rate 
  sc_obj$meta_dat$cDNA_duplication <- 1 - (sc_obj$meta_dat$total_umis /
                                             sc_obj$meta_dat$total_reads)
  
  sc_obj
}

sc_objs <- map(sc_objs, generate_matrices)
sc_objs <- map(sc_objs, norm_libsize)
sc_objs <- map(sc_objs, compute_summaries)

Compute enrichment of reads/umis over the original library.

sc_objs <- map(sc_objs,
    function(sub_dat){
      og_counts <- select(sc_objs[[reflib]]$meta_dat,
                          og_total_reads = total_reads,
                          og_total_umis = total_umis,
                          og_norm_total_umis = norm_total_umis,
                          og_norm_total_reads = norm_total_reads,
                          og_cDNA_duplication = cDNA_duplication,
                          cell_id)
      sub_dat$meta_dat <- left_join(sub_dat$meta_dat,
                         og_counts, 
                         by = "cell_id")
      
      sub_dat$meta_dat <- mutate(sub_dat$meta_dat,
                                 read_proportion = log2( total_reads / og_total_reads),
                                 umi_proportion = log2( total_umis / og_total_umis),
                                 norm_read_proportion = log2( norm_total_reads /
                                                                og_norm_total_reads),
                                 norm_umi_proportion = log2( norm_total_umis /
                                                               og_norm_total_umis))
      sub_dat
    })

cDNA duplication rate

sc_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
  mutate(library = factor(library, levels = libs)) %>% 
  arrange(resampled)

plt <- ggplot(sc_metadat, aes(total_umis, cDNA_duplication)) +
  geom_point(aes(color = resampled), size = 0.5) +
  labs(x = expression(paste("# of UMIs")),
       y = "Sequencing saturation") + 
  scale_x_log10(labels = scales::comma) +
  scale_color_manual(values = color_palette) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  facet_wrap(~library,
             labeller = labeller(library = lib_names)) +
  theme_cowplot(font_size = 16, line_size = 1)  +
  theme(legend.position = "top",
        plot.margin = unit(c(5.5, 20.5, 5.5, 5.5), 
                           "points")) 

plt
Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

Note the high level of sequencing saturation (0 = no-duplication, 1 = all duplicates) in the original library. Also note that the libraries tend to have higher saturatioin rates, after subsampling.

save_plot("cDNA_duplication.pdf", plt, 
          base_aspect_ratio = 1.25)

Read and umi counts per library

global_plot_theme <- theme(
        legend.position = "top",
        legend.text = element_text(size = 10),
        strip.text = element_text(size = 8))

resampled_metadat <- sc_metadat[sc_metadat$library != reflib, ] %>% 
  mutate(library = factor(library, 
                          levels = c(resampled_libs)))

unnorm_plt <- ggplot(resampled_metadat, 
                     aes(og_total_reads / 3, total_reads / 3, 
                         colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
  facet_wrap(~library, nrow = 1) + 
#  coord_fixed() +
  xlab("original library\nreads count (Thousands)") +
  ylab("resampled library\nreads count (Thousands)") +
 # ggtitle("Raw reads associated with each cell barcode") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

norm_plt <- ggplot(resampled_metadat, aes(og_norm_total_reads / 1e3, 
                                           norm_total_reads / 1e3, 
                                           colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nRPM (Thousands)") +
  ylab("resampled library\nRPM (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

unnorm_umi_plt <- ggplot(resampled_metadat, 
                         aes(og_total_umis / 1e3, 
                             total_umis / 1e3, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) +
 # coord_fixed() +
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI count (Thousands)") +
  ylab("resampled library\nUMI count (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = .5) +
    theme(aspect.ratio = 1) + 
  global_plot_theme

norm_umi_plt <- ggplot(resampled_metadat, 
                       aes(og_norm_total_umis / 1e3, 
                           norm_total_umis / 1e3,
                           colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_abline(slope = 1) + 
  facet_wrap(~library, nrow = 1) + 
  xlab("original library\nUMI normalized RPM (Thousands)") +
  ylab("resampled library\nUMIs per Million (Thousands)") +
  scale_colour_manual(name = "resampled:",
                      values = color_palette) +
  theme_cowplot(font_size = 10, line_size = 0.5) +
  theme(aspect.ratio = 1) + 
  global_plot_theme

plt <- plot_grid(unnorm_plt, norm_plt, unnorm_umi_plt, norm_umi_plt, 
                 labels = "AUTO",
                 align = 'hv')
plt

save_plot("reads_per_barcode_scatterplots.pdf", plt, 
          nrow = 2, ncol = 2, base_width = 8 )

Enrichment of reads/umis

read <- ggplot(resampled_metadat, 
       aes(cell_id, norm_read_proportion, colour = resampled)) + 
  geom_point() + 
  labs(x = "Cell", 
       y = expression(paste( " Log"[2], " normalized reads ", 
                             frac("resampled", "original")))) +
  scale_colour_manual(name = "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(axis.text.x = element_blank(),
        legend.position = "top",
        legend.text = element_text(size = 12))

umi <- ggplot(resampled_metadat, 
       aes(cell_id, norm_umi_proportion, colour = resampled)) + 
  geom_point() + 
  labs(x = "Cell", 
       y = expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_colour_manual(name = "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(axis.text.x = element_blank(),
        legend.position = "top",
        legend.text = element_text(size = 12))

plt <- plot_grid(read, umi,
                 labels = "AUTO",
                 align = 'hv')
plt

ggsave("reads_umi_ratio_per_barcode_normalized.pdf", width = 8, height = 5)

umi <- ggplot(filter(resampled_metadat, library %in% resampled_libs), 
       aes(cell_id, norm_umi_proportion, colour = resampled)) + 
  geom_point() + 
  labs(x = "Cell", 
       y = expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_colour_manual(name = "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(axis.text.x = element_blank(),
        legend.position = "top",
        legend.text = element_text(size = 12))

umi

ggsave("umi_ratio_per_cell.pdf", umi, width =5, height = 5)

umi_plots <- map(split(resampled_metadat, resampled_metadat$library),
  function(x){
    ggplot(x, 
       aes(og_total_umis, norm_umi_proportion, colour = resampled)) + 
  geom_point(size = 0.5) + 
  geom_hline(aes(yintercept = 0), 
             linetype ="dashed", 
             color = "darkgrey") + 
  labs(x = "Abundance in original library\n (UMIs)", 
       y = expression(paste( "Log"[2], " UMIs ", 
                             frac("resampled", "original")))) +
  scale_x_continuous(labels = scales::comma) +      
  scale_colour_manual(name = "resampled:", values = color_palette) +
  guides(colour = guide_legend(override.aes = list(size = 5))) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(legend.position = "top",
        legend.text = element_text(size = 12),
        plot.margin = unit(c(5.5, 20.5, 5.5, 5.5),
                           "points"))})

plt <- plot_grid(plotlist = umi_plots, nrow = 1)
save_plot("umi_ratio_MA.pdf", plt)
ggplot(resampled_metadat, aes(resampled, 
                read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_read_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized Reads ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_reads_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)

ggplot(resampled_metadat, aes(resampled, 
                norm_umi_proportion, fill = resampled)) + 
  geom_boxplot(coef = Inf) + 
  facet_wrap(~library, nrow = 1) +
  xlab("Selected for Reamplification") +
  ylab(expression(paste( "Log"[2], " normalized UMIs ", frac("resampled", "original")))) +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  theme_cowplot(font_size = 16, line_size = 0.5) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    legend.position = "top",
    legend.text = element_text(size = 12)
  )

ggsave("norm_umi_ratio_per_barcode_boxplot.pdf", width = 6, height = 5)
dat <- group_by(resampled_metadat, library) %>%
  filter(library != reflib) %>% 
  mutate(total_new = sum(total_reads, na.rm = T), 
         total_old = sum(og_total_reads, na.rm = T))

dat_group <- group_by(dat, library, resampled) %>% 
  summarize(total_new = sum(total_reads, 
                            na.rm = T) / unique(total_new), 
            total_old = sum(og_total_reads, 
                            na.rm = T) / unique(total_old)) %>% 
  gather(lib, percent_lib, -library, -resampled ) %>%
  mutate(lib = factor(lib, levels = c("total_old", "total_new"), 
                      labels = c("original\nlibrary", "resampled\nlibrary")))

ggplot(dat_group, aes(lib, percent_lib, fill = resampled)) + 
  geom_bar(stat = "identity") + 
  ylab("Fraction of\n Reads Assigned") +
  scale_fill_manual(name = "resampled:",
                    values = color_palette) +
  facet_wrap(~library) + 
  theme_cowplot(font_size = 16, line_size = 1) +
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0.5, vjust = 0.5),
    legend.position = "top",
    legend.text = element_text(size = 16)
  )

ggsave("proportion_reads_all_barcode_barplot.pdf", width = 7, height = 7)

dat_group %>% 
 rename(Method = library) %>% 
  spread(lib, percent_lib) %>% 
  mutate(`Targeted Library Read Fold-Enrichment` = `resampled\nlibrary` / `original\nlibrary`) %>%
  filter(resampled == T) %>% 
  select(Method, `Targeted Library Read Fold-Enrichment`)

Genes detected

## compute per gene or per gene/umi combo enrichment
detected_molecules <- function(sc_obj, molecule = "gene"){
  umis <- sc_obj$umi_matrix
  if (molecule == "gene"){
    n_genes <- colSums(umis > 0)
    out_mdat <- data_frame(cell_id = colnames(umis),
      n_genes = n_genes)
    sc_obj <- add_metadata(sc_obj, out_mdat)
    }
}
sc_objs <- map(sc_objs, ~detected_molecules(.x))
resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                           levels = libs))

og_genes <- filter(resampled_metadat, 
                   library == reflib) %>% 
  dplyr::select(cell_id, 
                og_genes = n_genes)

resampled_metadat <- left_join(resampled_metadat, 
                                og_genes,
                                by = "cell_id") %>% 
  filter(library != reflib)

plt <- ggplot(resampled_metadat, aes(og_genes, 
                               n_genes, colour = resampled)) + 
  geom_point(size = 0.5) + 
  ylab("Genes detected\n(resampled library)") +
  xlab("Genes detected\n(original library)") + 
  scale_colour_manual(name =  "resampled:", values = color_palette) +
  facet_wrap(~library, nrow = 1) + 
  guides(colour = guide_legend(override.aes = list(size = 3))) +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 10)
  )
plt

save_plot("genes_detected_scatterplot.pdf", plt)

Parse out new versus previously identified genes

calc_gene_sensitivity <- function(sc_obj, 
                                  type = "umi"){
  
  if (type == "umi"){
    count_matrix <- sc_obj$umi_matrix
  } else {
    count_matrix <- sc_obj$read_matrix
  }
  # generate list named with barcode of each detected gene and 
  # respective read/umi count
  genes_detected <- apply(count_matrix, 2, function(x) x[x > 0])
  sc_obj$genes_detected <- genes_detected
  sc_obj
}

sc_objs <- map(sc_objs, calc_gene_sensitivity)
sc_objs <- map(sc_objs, 
           function(x){
             og_genes <- sc_objs[[reflib]]$genes_detected
             sub_genes <- x$genes_detected
             
             # subset list of cell barcodes to the same as the og experiment
             # and also reorders the barcodes to match
             sub_genes <- sub_genes[names(og_genes)]
             
             if(length(sub_genes) != length(og_genes)){
               stop("barcode lengths not the same")
             }
             shared_genes <- map2(sub_genes, 
                                  og_genes,
                                  ~intersect(names(.x),
                                             names(.y)))
             new_genes <- map2(sub_genes,
                               og_genes,
                               ~setdiff(names(.x),
                                        names(.y)))
             
             not_recovered_genes <- map2(og_genes,
                                         sub_genes,
                                         ~setdiff(names(.x),
                                                  names(.y)))
             x$shared_genes <- shared_genes
             x$new_genes <- new_genes
             x$not_recovered_genes <- not_recovered_genes
             return(x)
             })

## add gene recovery info to meta data table
sc_objs <- map(sc_objs, 
           function(x){
             shared_genes <- map2_dfr(x$shared_genes, 
                                names(x$shared_genes),
                                function(x, y){
                                  data_frame(cell_id = y,
                                             shared_genes = length(x))
                                 })
             
             not_recovered_genes <- map2_dfr(x$not_recovered_genes, 
                                names(x$not_recovered_genes),
                                function(x, y){
                                  data_frame(cell_id = y,
                                            not_recovered_genes = length(x))
                                 })
             
             new_genes <- map2_dfr(x$new_genes, 
                                names(x$new_genes),
                                function(x, y){
                                  data_frame(cell_id = y,
                                            new_genes = length(x))
                                 })
             gene_mdata <- left_join(shared_genes,
                                     not_recovered_genes,
                                     by = "cell_id") %>% 
               left_join(., new_genes, by = "cell_id")
             
             x <- add_metadata(x, gene_mdata)
             x
           })

resampled_metadat <- map(sc_objs, ~.x$meta_dat) %>% 
  bind_rows(.id = "library") %>% 
   mutate(library = factor(library, 
                          levels = libs))
genes_recovered <- resampled_metadat %>% 
  dplyr::filter(library != reflib) %>% 
  dplyr::select(cell_id, 
                library,
                resampled,
                shared_genes,
                not_recovered_genes, 
                new_genes)

genes_recovered <- gather(genes_recovered, 
                          key = type, value = count, 
                          -cell_id, -resampled, -library)
genes_recovered <- mutate(genes_recovered,
                          type = str_replace_all(type, "_", "\n"))

plt <- ggplot(genes_recovered, 
       aes(cell_id, count)) +
  geom_point(aes(color = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(type ~ library) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

save_plot("new_genes_detected.pdf", plt, base_width = 8, base_height = 8)

targeted <- genes_recovered %>% 
  dplyr::filter(library %in% resampled_libs,
                resampled) 
targeted
plt_dat <- genes_recovered %>% 
  dplyr::filter(!resampled) %>% 
  group_by(type) %>% 
  summarize(count = mean(count)) %>% 
  mutate(cell_id = "Not targeted\nbarcodes\n(mean)") %>% 
  bind_rows(targeted, .) %>% 
  mutate(type = ifelse(type == "new\ngenes",
                        "Newly\ndetected\ngenes",
                        ifelse(type == "shared\ngenes",
                               "Previously\ndetected\ngenes",
                               ifelse(type == "not\nrecovered\ngenes",
                                      "Previously\ndetected\ngenes\nnot recovered",
                                      NA))),
         cell_id = factor(cell_id, 
                          levels = c(str_c(cells[[resampled_lib]],
                                           "-1"), 
                          "Not targeted\nbarcodes\n(mean)")))


plt <- ggplot(plt_dat, 
       aes(cell_id, count)) +
  geom_bar(aes(fill = type),
           stat = "identity") +
  labs(y = "# of genes") + 
  scale_x_discrete(labels = cell_names) + 
  scale_y_continuous(labels = scales::comma) + 
  scale_fill_brewer(palette = "Set1", name = "") +
  guides(fill = guide_legend(override.aes = list(size = 0.25))) +
  theme(axis.title.x = element_blank(),
        axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        legend.position = "top",
        plot.margin = unit(c(5.5, 50.5, 5.5, 5.5), 
                           "points"))
      #  legend.key.size = unit(0.25, "pt")) 

plt

save_plot("new_genes_barplot.pdf", plt, 
          base_width = 4, base_height = 5)
new_genes <- sc_objs$mkcell_pulldown$new_genes[str_c(cells$mkcell_pulldown,
                                                     "-1")]

all_genes <- unique(unlist(new_genes))

library(UpSetR)
pdf("new_genes_shared.pdf")
  upset(fromList(new_genes), order.by = "freq")
dev.off()
## quartz_off_screen 
##                 2
upset(fromList(new_genes), order.by = "freq")

Compare new genes to previously detected genes in the cluster

Plot expression per cell

MA plots

calc_ma <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  x_rn <- rownames(xmat)
  y_rn <- rownames(ymat)
  xmat <- log2(xmat + 1)
  ymat <- log2(ymat + 1)
  
  rownames(xmat) <- x_rn
  rownames(ymat) <- y_rn
  m <- Matrix::rowMeans(log2(((2^ymat + 2^xmat) / 2) + 1))
  a <- xmat[, cell] - ymat[, cell]
  data_frame(gene = names(a),
             mean_expression_log2 = m,
             log2_diff = a)
}


genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- str_c(cells[[resampled_lib]], "-1")

## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[, cols],
                            sc_objs[[reflib]]$umi_matrix[, cols])

ma_dat <- map(cell_ids,
    ~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols], 
             ref_mat[genes_to_plot, cols],
             cell = .x))

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")

plot_ma <- function(df){
  n_up <- filter(df, log2_diff > 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("up = ", n))
  
  n_down <- filter(df, log2_diff < 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("down = ", n))
  
  if (nrow(n_down) == 0) {
    n_down = data_frame(cell = df$cell %>% unique(),
                        n = "down = 0")
  }
  plt <- ggplot(df,
         aes(mean_expression_log2,
             log2_diff)) +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") + 
    geom_point(size = 0.25) +
    geom_text(data = n_up, aes(x = max(ma_dat$mean_expression_log2) * 0.9, 
                               y = max(ma_dat$log2_diff) * 1.2, 
                               label = n)) +
    geom_text(data = n_down, aes(x = max(ma_dat$mean_expression_log2) * 0.9, 
                                 y = min(ma_dat$log2_diff) * 1.2, 
                                 label = n)) + 
    facet_wrap(~cell, nrow = 1) +
    labs(x = expression(paste("Abundance (log"[2], ")")),
         y = expression(paste(frac("resampled","Original"), " (log"[2], ")")))
  plt
}

plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_all_genes.pdf", plt, base_height = 6)

## Shared genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_shared_genes.pdf", plt, base_height = 6)


## New genes
ma_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell")
plt <- plot_ma(ma_dat)
plt

save_plot("per_cell_MA_plot_new_genes.pdf", plt, base_height = 6)

Histograms

get_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat) %>% 
    gather(library, 
           Expression, -gene)
}

plot_histogram <- function(df){
  ggplot(df, 
         aes_string("Expression")) +
    geom_density(aes_string(fill = "library"),
                 alpha = 0.66) +
    scale_fill_viridis_d(name = "") +
    facet_wrap(~cell, nrow = 1) +
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_histogram(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_histogram(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- plot_histogram(expr_dat)


plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt,
  new_gene_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_histograms.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

scatterplots

get_paired_expr <- function(xmat, ymat, cell = "GCAGTTAAGTGTCCAT"){
  xrows <- rownames(xmat)
  xmat <- log2(xmat[, cell] + 1)
  ymat <- log2(ymat[xrows, cell] + 1)
  data_frame(
    gene = xrows,
    resampled = xmat,
    original = ymat)
}

plot_scatter <- function(df){
  ggplot(df, 
         aes_string("original", "resampled")) +
    geom_point(size = 0.5) + 
    geom_abline(aes(slope = 1, intercept = 0)) +
    facet_wrap(~cell, nrow = 1) +
    coord_fixed() + 
    theme(legend.position = "top",
          strip.text = element_text(size = 8)) 
}

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- names(sc_objs[[resampled_lib]]$genes_detected[[x]])
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
expressed_in_resampled_plt <- plot_scatter(expr_dat)
expressed_in_resampled_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$shared_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
share_genes_plt <- plot_scatter(expr_dat)
share_genes_plt

expr_dat <- map(cell_ids,
  function(x){
    genes_to_plot <- sc_objs[[resampled_lib]]$new_genes[[x]]
    get_paired_expr(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot,
                                                              cols],
            ref_mat[genes_to_plot, cols],
             cell = x)
})

names(expr_dat) <- cell_ids
expr_dat <- bind_rows(expr_dat, .id = "cell")
new_gene_plt <- ggplot(expr_dat, aes(cell,resampled)) +
  geom_jitter(alpha = 0.55) +
  geom_violin(aes(fill = cell)) +
  ylim(0, max(expr_dat$resampled) * 1.10) + 
  scale_fill_brewer(palette = "Set1") +
    theme(axis.text.x = element_text(angle = 90, 
                                     hjust = 1, vjust = 0.5),
          legend.pos = "none",
          axis.title.x = element_blank()) 

new_gene_plt

plts <- list(
  expressed_in_resampled_plt,
  share_genes_plt
)

plt <- plot_grid(plotlist = plts, ncol = 1)
plt

save_plot("expression_scatterplots.pdf", plt, 
          ncol = 1, nrow = 3,
          base_height = 4,
          base_aspect_ratio = 2)

save_plot("expression_newgenes_violinplots.pdf", new_gene_plt, 
          base_height = 6,
          base_aspect_ratio = 0.5)

UMIs detected

Parse out new verses old UMIs

compare_umis <- function(path_to_ctrl,
                         path_to_test,
                         return_summary = F,
                         cells_exclude = "Cell_unmatched"){

  ## umi seqs should be produced by ./get_molecule_info
  ctrl_umi_seqs <- read_tsv(path_to_ctrl,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(!barcode_10x %in% cells_exclude)

  test_umi_seqs <- read_tsv(path_to_test,
                   col_names = c("barcode_10x", 
                                 "umi_molecule", 
                                 "count")) %>% 
  filter(!barcode_10x %in% cells_exclude)

  umi_seqs <- full_join(ctrl_umi_seqs, 
          test_umi_seqs, 
          by = c("barcode_10x", "umi_molecule"))
  
  if (return_summary) {
    umi_seqs %>% 
      mutate(new_umi = ifelse(is.na(count.x) & !is.na(count.y), 
                          1L, 
                          0L),
         not_detected_umi = ifelse(!is.na(count.x) & is.na(count.y),
                                   1L,
                                   0L),
         shared_umi = ifelse(!is.na(count.x) & !is.na(count.y),
                             1L,
                             0L)) %>% 
      group_by(barcode_10x) %>% 
      summarize(new_umis = sum(new_umi),
            not_detected_umis = sum(not_detected_umi),
            shared_umis = sum(shared_umi))
  } else {
    umi_seqs
  }
}

umi_files <- file.path(data_dir, libs, "umis", "umigroups.txt.gz")

umi_summaries <- map(umi_files[2],
                   ~compare_umis(umi_files[1], .x, return_summary = T))

names(umi_summaries) <- umi_files[2] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

umi_summary <- bind_rows(umi_summaries, .id = "library")

umis_recovered <- umi_summary %>% 
  gather(class, count, -barcode_10x, -library) 

## annotate with resampled or not
cell_annot <- data_frame(barcode_10x = cells,
                         library = libs,
                         resampled = T) %>% 
  unnest()

umis_recovered <- umis_recovered %>% 
  mutate(resampled = ifelse(str_replace(barcode_10x, "-1", "") %in% unlist(cells),
                             T,
                             F)) %>% 
  arrange(resampled)

plt <- ggplot(umis_recovered, 
       aes(barcode_10x, count)) +
  geom_point(aes(colour = resampled),
             size = 0.6,
             alpha = 0.75) +
  facet_grid(library ~ class) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        strip.text.y = element_text(size = 12,
                                    margin = margin(0,0.2,0,0.2, "cm"))) +
  scale_color_manual(values = color_palette)

plt 

umi_seqs <- map(umi_files[2],
                ~compare_umis(umi_files[1], .x, return_summary = F))

names(umi_seqs) <- umi_files[2] %>% 
  str_split(., "/") %>% 
  map_chr(~.x[7])

new_umis <- map(umi_seqs, 
    ~filter(.x, 
       str_replace(barcode_10x, "-1", "") %in% unlist(cells), 
       !is.na(count.y), 
       is.na(count.x))  %>% 
  separate(umi_molecule, c("seq", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 

old_umis <- map(umi_seqs, 
    ~filter(.x, 
       str_replace(barcode_10x, "-1", "") %in% unlist(cells), 
       !is.na(count.x),
       !is.na(count.y))  %>% 
  separate(umi_molecule, c("seq", "gene"), sep = "::") %>% 
    select(-starts_with("count"))) 


umi_edit_dist <- map2(new_umis,
                      old_umis,
                     ~left_join(.x, .y, 
               by = c("barcode_10x", "gene")) %>% 
  na.omit() %>% 
  mutate(ed = kentr::get_hamming_pairs(seq.x, seq.y)) %>% 
  group_by(barcode_10x, seq.y, gene) %>% 
  summarize(min_ed = as.integer(min(ed))) %>% 
  ungroup())

umi_edit_dist <- bind_rows(umi_edit_dist, 
                           .id = "library")
umi_edit_dist <-  mutate(umi_edit_dist, 
                          barcode_10x = factor(barcode_10x, 
                              levels = str_c(cells[[resampled_lib]], "-1")))

plt <- ggplot(umi_edit_dist, aes(barcode_10x, 
                          min_ed)) + 
  geom_boxplot(coef = Inf) +
  scale_x_discrete(labels = cell_names) +
  facet_wrap(~library, scales = "free_x", 
             labeller = labeller(library = lib_names)) +
  scale_y_continuous(breaks= scales::pretty_breaks()) +
  labs(y = "Minimum edit distance\noriginal vs. new UMIs") +
  theme(axis.text.x = element_text(angle = 90,
                                   hjust = 1, 
                                   vjust = 0.5),
        axis.title.x = element_blank())

save_plot("umi_edit_distance.pdf", plt)
saveRDS(sc_objs, "sc_objs.rds")

tSNE analysis

original library tSNE

library(Seurat)

mat <- sc_objs[[reflib]]$umi_matrix
sobj <- CreateSeuratObject(mat, min.genes = 200)
sobj <- NormalizeData(sobj)

sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = T, y.cutoff = 0.25)

sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data), 
               pcs.compute = 20, 
               do.print = F, seed.use = 20180525)
sobj <- RunTSNE(sobj, dims.use = 1:15, seed.use = 20180525)
sobj <- FindClusters(sobj,
                     dims.use = 1:15, 
                     k.param = 15,
                     resolution = 1.2, 
                     print.output = F, 
                     random.seed = 20180525)
plt <- TSNEPlot(sobj, 
                colors.use = c(brewer.pal(12, "Paired"), 
                               brewer.pal(9, "Set1")),
                do.label = T) + 
  labs(title = "PBMCs") +
  theme(legend.position = "none")

plt

save_plot("original_mkcells_tsne.pdf", plt, 
          base_height = 4.25, base_width = 4.25)

cell_mdata <- sobj@meta.data %>% 
  tibble::rownames_to_column("cell") %>% 
  mutate(resampled = ifelse(cell %in% str_c(cells$mkcell_pulldown, "-1"), 
                             T,
                             F)) %>% 
  select(cell, resampled) %>% 
  as.data.frame() %>% 
  tibble::column_to_rownames("cell") 

sobj <- AddMetaData(sobj, cell_mdata, col.name = "resampled")

sobj <- SetAllIdent(sobj, "resampled")

plt <- TSNEPlot(sobj, 
                colors.use = c(brewer.pal(12, "Paired"), 
                               brewer.pal(9, "Set1")),
                do.label = F, 
                pt.size = 0.5) + 
  theme(legend.position = "none")

plt

save_plot("original_selected_mkcells_tsne.pdf", plt, 
          base_height = 4.25, base_width = 4.25)
immune_markers <- c(t_cells = "CD3E", 
                    cd8_t ="CD8A", 
                    cytotoxic_t = "NKG7", 
                    dendritic = "FCER1A", 
                    megakaryocyte = "PF4", 
                    b_cell = "CD79A", 
                    cd4_4 = "IL7R", 
                    monocyte = "CD14",
                    RBC = "HBB",
                    NK = "NCAM1",
                    pDC = "LILRA4",
                    monocyte = "FCGR3A")

plts <- map(immune_markers, ~plot_feature(sobj, gene = .x, 
                                          pt.alpha = 1))
plt <- plot_grid(plotlist = plts,  nrow = 4)
save_plot("og_tsne_markers.pdf", plt,
          nrow = 4, ncol = 3)

sobj <- SetAllIdent(sobj, "res.1.2")

new_ids <- c(
  "0" = "CD4+ T-Cells",
  "1" = "CD8+ T-Cells",
  "2" = "CD4+ T-Cells",
  "3" = "CD8+ T-Cells",
  "4" = "CD14+ Monocytes",
  "5" = "CD8+ T-Cells",
  "6" = "CD14+ Monocytes",
  "7" = "FCGR3A+ Monocytes",
  "8" = "CD14+ Monocytes",
  "9" = "CD14+ Monocytes",
  "10" = "Dendritic",
  "11" = "CD4+ T-Cells",
  "12" = "B-Cells",
  "13" = "NK Cells",
  "14" = "Megakaryocytes",
  "15" = "CD8+ T-Cells",
  "16" = "Plasmacytoid dendritic"
)
old_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  pull("res.1.2")

new_labels <- new_ids[old_ids] %>% unname()
new_df <- data.frame(row.names = rownames(sobj@meta.data),
                     cell_labels = new_labels)
sobj <- AddMetaData(sobj, new_df)
cell_percents <- group_by(sobj@meta.data, cell_labels) %>% 
  summarize(n_cells = n()) %>% 
  mutate(total_cells = sum(n_cells), 
         percentage = 100 * (n_cells / total_cells)) %>% 
  select(-total_cells)

cell_percents
# find region around the megakaryocyte cells to highlight
tsne_dat <- GetCellEmbeddings(sobj, "tsne") %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("cell") %>% 
  left_join(sobj@meta.data %>%
              as.data.frame() %>% 
              tibble::rownames_to_column("cell"),
            ., by = "cell") %>% 
  left_join(., cell_percents, by = "cell_labels") %>% 
  mutate(cell_labels_annotated = str_c(cell_labels, 
                            " (",
                            signif(percentage, 3),
                            "%)"))
dat <- filter(tsne_dat, cell_labels == "Megakaryocytes")

x_min = min(dat[, c("tSNE_1")]) - 1.5
x_max = max(dat[, c("tSNE_1")]) + 1.5
y_min = min(dat[, c("tSNE_2")]) - 1.5
y_max = max(dat[, c("tSNE_2")]) + 1.5

labeled_plt <- ggplot(tsne_dat, 
  aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = cell_labels_annotated), size = 0.1) + 
  scale_color_manual(values = brewer.pal(10, "Paired"),
                     name = "") +
  labs(title = "",
       x = "tSNE 1",
       y = "tSNE 2") +
  geom_rect(aes(xmin = x_min,
                xmax = x_max,
                ymin = y_min,
                ymax = y_max), alpha = 0, color = "black", 
            size = 0.5) +
   theme_cowplot() + 
  guides(color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.pos = "right",
        aspect.ratio = 1) 


sub_plt <- ggplot(arrange(tsne_dat, resampled), 
  aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = resampled), size = 2) + 
  scale_color_manual(values = c(brewer.pal(10, 
                                           "Paired")[7],
                                color_palette[2]),
                     name = "Resampled") +
  coord_cartesian(xlim = c(x_min, x_max),
                  ylim = c(y_min, y_max)) +
  guides(color = guide_legend(override.aes = list(size = 5))) + 
  labs(x = "tSNE 1",
       y = "tSNE 2")  +
  theme(aspect.ratio = 1)

plt <- plot_grid(labeled_plt, sub_plt, align = 'hv', 
                 nrow = 1)
save_plot("original_pbmc_annotated_tsne.pdf", plt, 
          base_width = 5.5, 
          base_aspect_ratio = 1.5, nrow = 1, ncol = 2)
saveRDS(sobj, "og_sobj.rds")
if (!file.exists("original_pbmc_markers.txt")){
  sobj <- SetAllIdent(sobj, "cell_labels")
  all_markers <- FindAllMarkers(sobj)
  write_tsv(all_markers, "original_pbmc_markers.txt")
}

all_markers <- read_tsv("original_pbmc_markers.txt")
cell_mdata <- sobj@meta.data %>% 
  tibble::rownames_to_column("cell") 

original library tSNE supplemented with resampled barcodes

mat <- sc_objs[[reflib]]$umi_matrix

resampled_ids <- sc_objs[[resampled_libs]]$meta_dat %>% 
  filter(resampled) %>% 
  pull(cell_id)

resampled_mat <- sc_objs[[resampled_libs]]$umi_matrix[, resampled_ids]
colnames(resampled_mat) <- str_c(colnames(resampled_mat), 
                                  "::", "resampled")

mat <- as.data.frame(as.matrix(mat)) %>% rownames_to_column("gene")
resampled_mat <- as.data.frame(as.matrix(resampled_mat)) %>% rownames_to_column("gene")

combined_mats <- left_join(mat, resampled_mat, by = c("gene")) 
combined_mats <- as.data.frame(combined_mats) %>% 
  column_to_rownames("gene") %>% 
  as.matrix() %>% 
  as(., "sparseMatrix")   

combined_mats[is.na(combined_mats)] <- 0

sobj <- CreateSeuratObject(combined_mats, min.genes = 200)

new_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  mutate(resampled = ifelse(str_detect(cell, "resampled"),
                             "resampled",
                             "not resampled"))

resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", 
                              "cell"] %>% 
  str_replace("::resampled", "")
 
new_ids <- mutate(new_ids, 
                  resampled = ifelse(cell %in% resampled_cell_ids, 
                                      "original cell",
                                      resampled)) %>% 
  select(cell, resampled) %>% 
  as.data.frame(.) %>% 
  column_to_rownames("cell")

sobj <- AddMetaData(sobj, new_ids)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- FindVariableGenes(sobj, do.plot = F, y.cutoff = 0.25)
sobj <- RunPCA(sobj, pc.genes = rownames(sobj@data), 
               pcs.compute = 20, 
               do.print = F, seed.use = 20180605)
sobj <- RunTSNE(sobj, dims.use = 1:15, seed.use = 20180605)
sobj <- FindClusters(sobj,
                     dims.use = 1:15, 
                     resolution = 1.2, 
                     print.output = F, 
                     random.seed = 20180605)
plt <- TSNEPlot(sobj, 
                colors.use = c(brewer.pal(12, "Paired"), 
                               brewer.pal(9, "Set1")),
                do.label = T) + 
  theme(legend.position = "none")

plt

save_plot("original_with_resampled_mkcells_tsne.pdf", plt, 
          base_height = 4.25, base_width = 4.25)
### Plot original and resampled close-up
pf4 <- plot_feature(sobj, gene = "PF4", pt.alpha = 1, pt.size = 0.1) +
  labs(title = "",
       x = "tSNE 1",
       y = "tSNE 2") + 
  theme_cowplot() +
  theme(legend.pos = "right")

# find region around the resampled cells to highlight

tsne_dat <- GetCellEmbeddings(sobj, "tsne") %>% 
  as.data.frame() %>% 
  tibble::rownames_to_column("cell") %>% 
  left_join(sobj@meta.data %>%
              as.data.frame() %>% 
              tibble::rownames_to_column("cell"),
            ., by = "cell")

og_dat <- filter(tsne_dat, resampled == "original cell")

sub_dat <-  filter(tsne_dat, resampled == "resampled") %>% 
  mutate(cell = str_replace(cell, "::resampled", "")) %>% 
  select(cell, tSNE_1_resampled = tSNE_1, tSNE_2_resampled = tSNE_2)

dat <- left_join(og_dat, sub_dat, by = "cell")

xpos <- colMeans(as.matrix(dat[, c("tSNE_1", "tSNE_2")]) )[1]
ypos <- colMeans(as.matrix(dat[, c("tSNE_1", "tSNE_2")]) )[2]
x_min = xpos - 1.5
x_max = xpos + 1.5
y_min = ypos - 1.5
y_max = ypos + 1.5

resampled_cells <- ggplot(tsne_dat, 
  aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = resampled), size = 0.1) + 
  scale_color_manual(values = c("lightgrey",
                                            brewer.pal(7,
                                                       "Set1")[1:2]),
                     name = "",
                     labels = c("not resampled" = "Not Resampled",
                                "original cell" = "Original Cell",
                                "resampled" = "Resampled Cell")) +
  geom_rect(aes(xmin = x_min,
                xmax = x_max,
                ymin = y_min,
                ymax = y_max), alpha = 0, color = "black",
            size = 0.5) +
  labs(x = "tSNE 1", y = "tSNE 2") +
  theme_cowplot() + 
  guides(ncol = 1, 
         color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.pos = "right") 


close_up <- ggplot(tsne_dat, 
  aes(tSNE_1, tSNE_2)) + 
  geom_point(aes(color = resampled), size = 2) + 
  scale_color_manual(values = c("lightgrey",
                                            brewer.pal(7,
                                                       "Set1")[1:2]),
                     name = "",
                     labels = c("not resampled" = "Not Resampled",
                                "original cell" = "Original Cell",
                                "resampled" = "Resampled Cell")) +
   coord_cartesian(xlim = c(x_min, x_max),
                  ylim = c(y_min, y_max)) +
  geom_segment(data = dat, 
                               aes(x = tSNE_1,
                                   y = tSNE_2, 
                                   xend = tSNE_1_resampled,
                                   yend = tSNE_2_resampled),
                               linejoin = "mitre",
                               arrow = arrow(length = unit(0.02,
                                                           "npc"))) + 
   labs(x = "tSNE 1", y = "tSNE 2") +
  theme_cowplot() + 
  guides(ncol = 1, 
         color = guide_legend(override.aes = list(size = 4))) + 
  theme(legend.pos = "right")
  
plt <- plot_grid(pf4, resampled_cells, 
          close_up, nrow = 1, rel_widths = c(0.85, 1, 1))

save_plot("resampled_vs_original_closeup.pdf", 
          plt, nrow = 1, ncol = 3, base_aspect_ratio = 1.2)
saveRDS(sobj, "rs_sobj.rds")

kNN analysis

Find the k-nearest neighbors in PCA space

## use combined data from above
data.use <- GetCellEmbeddings(object = sobj,
                              reduction.type = "pca",
                              dims.use = 1:20)

## findnearest neighboors using exact search
knn <- RANN::nn2(data.use, k = 5,
                 searchtype = 'standard',
                 eps = 0)

resampled_idxs <- knn$nn.idx[str_detect(rownames(data.use),
                                         "::resampled"), ]

nn_ids <- as_data_frame(t(apply(resampled_idxs, 1,
                      function(x)rownames(data.use)[x])))

colnames(nn_ids) <- c("query_cell", 
                      paste0("nearest neighbor ", 
                             1:(ncol(nn_ids) - 1)))

nn_ids

Heatmap analysis

Plot average megakaryocyte expression, the original megakaroytes, and the resampled megakaryocytes. Plot markers of megakaryocytes, defined above using the original data.

og_sobj <- readRDS("og_sobj.rds")

Combined resampled and original cells

Calculate markers for MKs after merging resampling expression into original cells.

mat <- sc_objs[[reflib]]$umi_matrix

resampled_mat <- sc_objs[[resampled_lib]]$umi_matrix[,
                                                str_c(cells[[resampled_lib]], "-1")]

not_resampled_cell_ids <- colnames(mat)[!colnames(mat) %in% str_c(cells[[resampled_lib]],
                                                                  "-1")]
no_resampled_mat <- mat[, not_resampled_cell_ids]

## add additional genes (original)
new_genes <- setdiff(rownames(resampled_mat), rownames(no_resampled_mat))
zero_mat <- matrix(0L, 
                   ncol = ncol(no_resampled_mat), 
                   nrow = length(new_genes),
                   dimnames = list(new_genes, colnames(no_resampled_mat)))
no_resampled_mat <- rbind(no_resampled_mat, zero_mat)

## add additional genes (resampled)
new_genes <- setdiff(rownames(no_resampled_mat), rownames(resampled_mat))
zero_mat <- matrix(0L, 
                   ncol = ncol(resampled_mat), 
                   nrow = length(new_genes),
                   dimnames = list(new_genes, colnames(resampled_mat)))
resampled_mat <- rbind(resampled_mat, 
      zero_mat)
## match original matrix roworder
resampled_mat <- resampled_mat[rownames(no_resampled_mat), ]

combined_mat <- cbind(no_resampled_mat, resampled_mat)

sobj <- CreateSeuratObject(combined_mat, min.genes = 200)

new_ids <- sobj@meta.data %>% 
  rownames_to_column("cell") %>% 
  mutate(resampled = ifelse(cell %in% str_c(cells[[resampled_lib]], "-1"),
                             "resampled",
                             "not resampled"))

resampled_cell_ids <- new_ids[new_ids$resampled == "resampled", 
                               "cell"] %>% 
  str_replace("::resampled", "")
 
new_ids <- mutate(new_ids, 
                  resampled = ifelse(cell %in% resampled_cell_ids, 
                                      "original cell",
                                      resampled)) %>% 
  select(cell, resampled) %>% 
  as.data.frame(.) %>% 
  column_to_rownames("cell")

sobj <- AddMetaData(sobj, new_ids)

cell_ids <- select(cell_mdata, cell, cell_labels) %>% 
  as.data.frame() %>% 
  tibble::column_to_rownames("cell")

sobj <- AddMetaData(sobj, cell_ids)
sobj <- NormalizeData(sobj)
sobj <- ScaleData(sobj)
sobj <- SetAllIdent(sobj,
                    "cell_labels") 
new_markers <- FindMarkers(sobj, 
                           ident.1 = "Megakaryocytes")

new_mk_markers <- tibble::rownames_to_column(new_markers, "gene") %>% 
  filter(p_val_adj < 0.01) %>% 
  tbl_df()

mk_markers <- read_tsv("original_pbmc_markers.txt") %>% 
  filter(cluster == "Megakaryocytes", p_val_adj < 0.01)

shared_mk_markers <- inner_join(new_mk_markers,
                                mk_markers, by = "gene",
                                suffix = c("_new", "_old"))

ggplot(shared_mk_markers,
       aes(p_val_old, p_val_new)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10() +
  geom_abline()

ggplot(shared_mk_markers,
       aes(avg_logFC_old, avg_logFC_new)) +
  geom_point() + 
  geom_abline()

plot_expr_raw <- data_frame(og = og_sobj@raw.data[new_mk_markers$gene,
                                              str_c(cells$mkcell_pulldown, "-1")[1]],
                        resampled = sobj@raw.data[new_mk_markers$gene,
                                                  str_c(cells$mkcell_pulldown, "-1")[1]])

ggplot(plot_expr_raw, 
       aes(og, resampled)) + 
  geom_point(size = 0.5) +
  geom_abline()

plot_expr_norm <- data_frame(og = og_sobj@data[new_mk_markers$gene,
                                              str_c(cells$mkcell_pulldown, "-1")[1]],
                        resampled = sobj@data[new_mk_markers$gene,
                                                  str_c(cells$mkcell_pulldown,
                                                        "-1")[1]])

ggplot(plot_expr_norm, 
       aes(og, resampled)) + 
  geom_point(size = 0.5) +
  geom_abline()

information about genes recovered

mk_markers <- filter(all_markers, 
                     cluster == "Megakaryocytes", 
                     p_val_adj < 0.01)

genes_to_plot <- rownames(sc_objs[[resampled_lib]]$umi_matrix)
cols <- colnames(sc_objs[[resampled_lib]]$norm_umi)
cell_ids <- str_c(cells[[resampled_lib]], "-1")

## append genes to reference library if necessary
ref_mat <- standardize_rows(sc_objs[[resampled_lib]]$umi_matrix[,
                                                                 cols],
                            sc_objs[[reflib]]$umi_matrix[, cols])

ma_dat <- map(cell_ids,
    ~calc_ma(sc_objs[[resampled_lib]]$umi_matrix[genes_to_plot, cols], 
             ref_mat[genes_to_plot, cols],
             cell = .x))

names(ma_dat) <- cell_ids
ma_dat <- bind_rows(ma_dat, .id = "cell") %>% 
  mutate(cell = factor(cell,
                       levels = str_c(cells[[resampled_lib]],
                                      "-1")))

plot_ma <- function(df, markers){
  
  df <- mutate(df, 
               marker = ifelse(gene %in% markers,
                                  "Megakaryocyte marker gene",
                                  "Other gene"))
  n_up <- filter(df, log2_diff > 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("up = ", n))
  
  n_down <- filter(df, log2_diff < 0) %>% 
    group_by(cell) %>% 
    summarize(n = n(), n = paste0("down = ", n))
  
  if (nrow(n_down) == 0) {
    n_down = data_frame(cell = df$cell %>% unique(),
                        n = "down = 0")
  }
  plt <- ggplot(df,
         aes(mean_expression_log2,
             log2_diff)) +
    geom_hline(aes(yintercept = 0), linetype = "dashed", colour = "grey") + 
    geom_point(aes(color = marker), size = 0.25) +
    geom_text(data = n_up, aes(x = max(ma_dat$mean_expression_log2) * 0.7, 
                               y = max(ma_dat$log2_diff) * 1.2, 
                               label = n)) +
    geom_text(data = n_down, aes(x = max(ma_dat$mean_expression_log2) * 0.7, 
                                 y = min(ma_dat$log2_diff) * 1.2, 
                                 label = n)) + 
    facet_wrap(~cell, nrow = 1, 
               labeller = labeller(cell = cell_names)) +
    labs(x = expression(paste("Abundance (log"[2], ")")),
         y = expression(paste(frac("resampled","Original"), 
                              " (log"[2], ")"))) +
    scale_color_brewer(palette = "Set1", name = "") +
    guides(colour = guide_legend(override.aes = list(size = 4))) + 
    theme(legend.position = "top")
  plt
}

plt <- plot_ma(ma_dat, mk_markers$gene)
plt

save_plot("per_cell_MA_mk_markers.pdf", plt, 
          base_height = 4, base_aspect_ratio = 2.5)

How much overlap with known MK markers

The 4 resampled cells have higher numbers of genes detected, but adding just these 4 cells is not sufficent to increase the number of marker genes discovered for the MK cluster. This is likely due to there only being 4 cells out of 70 that are resampled. More cells would be necessary for this task.

To determine if a subset of the new genes detected in the 4 cells are actually markers of MKs I instead compared these genes to genes identified in another PBMC experiment. 10x genomics has a scRNA-Seq dataset with 68k cells, which has > 200 MK cells (compared to our 70). I computed markers for these cells and will use this dataset as a reference for MK cell markers, in addition to the markers originally identified in the kirkpatrick expt.

Shared marker genes

sc_objs <- readRDS("sc_objs.rds")
mk_10x_markers <- read_tsv(file.path(results_dir, 
                                      "2018-06-01",
                                      "mk_markers_pbmc68k.txt")) %>% 
  filter(p_val_adj < 0.01)

new_genes <- sc_objs$mkcell_pulldown$new_genes[str_c(cells$mkcell_pulldown,
                                                     "-1")]


names(new_genes) <-  cell_names[names(new_genes)]

pdf("newgenes_shared_in_pulldown_cells.pdf")
  upset(fromList(new_genes), order.by = "freq")
dev.off()
## quartz_off_screen 
##                 2
all_detected_genes <- sc_objs$mkcell_pulldown$genes_detected[str_c(cells$mkcell_pulldown,
                                                     "-1")] %>% 
  map(., names)

names(all_detected_genes) <-  cell_names[names(all_detected_genes)]

pdf("genes_shared_in_pulldown_cells.pdf")
  upset(fromList(all_detected_genes), order.by = "freq")
dev.off()
## quartz_off_screen 
##                 2
all_detected_genes <- sc_objs$kirkpatrick$genes_detected[str_c(cells$mkcell_pulldown,
                                                     "-1")] %>% 
  map(., names)

names(all_detected_genes) <-  cell_names[names(all_detected_genes)]

pdf("genes_shared_in_original_cells.pdf")
  upset(fromList(all_detected_genes), order.by = "freq")
dev.off()
## quartz_off_screen 
##                 2

compare to marker genes

sc_objs <- readRDS("sc_objs.rds")
mk_10x_markers <- read_tsv(file.path(results_dir, 
                                      "2018-06-01",
                                      "mk_markers_pbmc68k.txt")) %>% 
  filter(p_val_adj < 0.01)

new_genes <- sc_objs$mkcell_pulldown$new_genes[str_c(cells$mkcell_pulldown,
                                                     "-1")]
genes_per_cell <- map(sc_objs,
                  ~.x$genes_detected[str_c(cells$mkcell_pulldown,
                                                     "-1")])

all_genes <- map(genes_per_cell, ~map(.x, names))

summary_df <- map(all_genes,
               ~map(.x,
                    ~data_frame(n_markers = sum(.x %in%
                                             mk_10x_markers$gene),
                           total_genes = length(.x))))
summary_df <- map(summary_df, bind_rows, .id = "cell") %>% 
  bind_rows(., .id = "library")


plt <- ggplot(summary_df,
       aes(cell, n_markers)) +
  geom_bar(aes(fill = library), 
           stat = "identity",
           position = "dodge") +
  scale_x_discrete(labels = cell_names) + 
  scale_fill_brewer(palette = "Set1",
                    name = "",
                    labels = lib_names) +
  labs(y = "# of Megakaryocyte\nmarker genes") +
  guides(fill = guide_legend(nrow = 2)) + 
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.title.x = element_blank(),
        legend.pos = "top")
  
save_plot("nmarkers_10x_pbmcs.pdf", 
          plt,
          base_aspect_ratio = 0.75)

og_mk_markers <- read_tsv(file.path("original_pbmc_markers.txt")) %>% 
  filter(cluster == "Megakaryocytes",
         p_val_adj < 0.01)

summary_df <- map(all_genes,
               ~map(.x,
                    ~data_frame(n_markers = sum(.x %in%
                                             og_mk_markers$gene),
                           total_genes = length(.x))))

summary_df <- map(summary_df, bind_rows, .id = "cell") %>% 
  bind_rows(., .id = "library") %>% 
  mutate(cell = factor(cell, 
                       levels = str_c(cells[[resampled_lib]],
                                      "-1")))

plt <- ggplot(summary_df,
       aes(cell, n_markers)) +
  geom_bar(aes(fill = library), 
           stat = "identity",
           position = "dodge") +
  scale_x_discrete(labels = cell_names) + 
  scale_fill_brewer(palette = "Set1",
                    name = "",
                    labels = lib_names) +
  labs(y = "# of Megakaryocyte\nmarker genes") +
  guides(fill = guide_legend(nrow = 2)) + 
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.title.x = element_blank(),
        legend.pos = "top")
  
save_plot("nmarkers_kirkpatrick_pbmcs.pdf", 
          plt,
          base_aspect_ratio = 0.75)
plt

compare to all genes detected

og_sobj <- readRDS("og_sobj.rds")
og_sobj <- SetAllIdent(og_sobj, "cell_labels")
avg_og_expr <- AverageExpression(og_sobj)
mk_expr <- avg_og_expr[, "Megakaryocytes", drop = F]
mk_expr <- mk_expr[mk_expr$Megakaryocytes > 0, , drop = F]

summary_df <- map(all_genes,
               ~map(.x,
                    ~data_frame(n_markers = sum(.x %in%
                                             rownames(mk_expr)),
                           total_genes = length(.x))))
summary_df <- map(summary_df, bind_rows, .id = "cell") %>% 
  bind_rows(., .id = "library")

plt <- ggplot(summary_df,
       aes(cell, n_markers)) +
  geom_bar(aes(fill = library), 
           stat = "identity",
           position = "dodge") +
  scale_x_discrete(labels = cell_names) + 
  scale_fill_brewer(palette = "Set1",
                    name = "",
                    labels = lib_names) +
  labs(y = "# of Megakaryocyte\ngenes detected") +
  guides(fill = guide_legend(nrow = 2)) + 
  theme(axis.text.x = element_text(angle = 45,
                                   hjust = 1),
        axis.title.x = element_blank(),
        legend.pos = "top")
  
save_plot("ngenesdetected_kirkpatrick_pbmcs.pdf", 
          plt,
          base_aspect_ratio = 0.75)

compare gene expression with scatterplots

Next I’ll plot the original expression per cell versus the average expression in MK cells, then the resampled expression.

og_sobj <- readRDS("og_sobj.rds")
og_sobj <- SetAllIdent(og_sobj, "cell_labels")
average_expression <- AverageExpression(og_sobj)  
average_mk_expression <- average_expression[,
                                            "Megakaryocytes", 
                                            drop = F]

average_mk_expression <- log1p(average_mk_expression)
average_mk_expression <- average_mk_expression %>% 
  tibble::rownames_to_column("gene")
og_expr <- og_sobj@data[, str_c(cells$mkcell_pulldown, "-1")] %>% 
  as.matrix(.) %>% 
  as.data.frame(.) %>% 
  tibble::rownames_to_column("gene")

average_mk_expression <- left_join(average_mk_expression, og_expr)

average_mk_expression <-  average_mk_expression %>% 
  gather(cell, expr, -gene, -Megakaryocytes) %>% 
  tbl_df()

## get cor:

cell_cor <- average_mk_expression %>% 
  group_by(cell) %>% 
  do(broom::tidy(cor(.$Megakaryocytes, .$expr))) %>% 
  ungroup() %>% 
  mutate(cell_cor = str_c("R = ", 
                          formatC(x, big.mark = ",", digits = 3)))

og_plot <- ggplot(average_mk_expression, 
         aes(expr, Megakaryocytes)) +
    geom_point(size = 0.5) + 
    geom_abline(aes(slope = 1, intercept = 0)) +
    geom_text(data = cell_cor,
              aes(x = 2,
                  y = max(average_mk_expression$Megakaryocytes) * 0.9, 
                  label = cell_cor),
              size = 3) +
    labs(x = "Original Cell Expression",
         y = "Megakaryocyte Cluster\nAverage Expression") +
    facet_wrap(~cell, nrow = 1) +
    coord_fixed() + 
    theme(legend.position = "top",
          strip.text = element_text(size = 8))


sobj <- SetAllIdent(sobj, "cell_labels")
average_rs_expression <- AverageExpression(sobj)  
average_rsmk_expression <- average_rs_expression[,
                                            "Megakaryocytes", 
                                            drop = F]

average_rsmk_expression <- log1p(average_rsmk_expression)
average_rsmk_expression <- average_rsmk_expression %>% 
  tibble::rownames_to_column("gene")
rs_expr <- sobj@data[, str_c(cells$mkcell_pulldown, "-1")] %>% 
  as.matrix(.) %>% 
  as.data.frame(.) %>% 
  tibble::rownames_to_column("gene")

average_rsmk_expression <- left_join(average_rsmk_expression, rs_expr)

average_rsmk_expression <-  average_rsmk_expression %>% 
  gather(cell, expr, -gene, -Megakaryocytes) %>% 
  tbl_df()

## get cor:
rs_cell_cor <- average_rsmk_expression %>% 
  group_by(cell) %>% 
  do(broom::tidy(cor(.$Megakaryocytes, .$expr))) %>% 
  ungroup() %>% 
  mutate(cell_cor = str_c("R = ", 
                          formatC(x, big.mark = ",", digits = 3)))

rs_plt <- ggplot(average_rsmk_expression, 
         aes(expr, Megakaryocytes)) +
    geom_point(size = 0.5) + 
    geom_abline(aes(slope = 1, intercept = 0)) +
    geom_text(data = rs_cell_cor,
              aes(x = 2,
                  y = max(average_mk_expression$Megakaryocytes) * 0.9, 
                  label = cell_cor),
              size = 3) +
    facet_wrap(~cell, nrow = 1) +
    coord_fixed() + 
    labs(x = "Resampled Cell Expression",
         y = "Megakaryocyte Cluster\nAverage Expression") +
    theme(legend.position = "top",
          strip.text = element_text(size = 8))

cell_cor <- bind_rows(list(original = cell_cor, 
                           resampled = rs_cell_cor), .id = "type")

cell_cor
plt <- plot_grid(og_plot, rs_plt, nrow = 2)

save_plot("expression_versus_cluster_average.pdf", plt, 
          nrow = 2, ncol = 1,
          base_height = 4,
          base_aspect_ratio = 2)

Find markers with downsampled MK cluster

Rui had a great idea to downsample the # of cells in the MK cluster and find markers with the original cells, and the subsampled cells.

There are 68 mks in this data set.

mks <- sobj@meta.data[sobj@meta.data$cell_labels == "Megakaryocytes", ]
rs_mks <- mks[mks$resampled == "original cell", ]
not_re_mks <- mks[mks$resampled == "not resampled", ]
not_mks <- sobj@meta.data[sobj@meta.data$cell_labels != "Megakaryocytes", ]

n_mks_to_test <- seq(0, nrow(not_re_mks), by = 5)

set.seed(42)
not_re_mk_sampled <- map(n_mks_to_test, 
                         ~sample_n(not_re_mks, .x))

sampled_mks <- map(not_re_mk_sampled, 
                   ~c(rownames(.x), 
                      rownames(rs_mks)))

all_cells_minus_not_sampled_mks <- map(sampled_mks,
                                       ~c(.x, rownames(not_mks)))

subsampled_markers <- map(all_cells_minus_not_sampled_mks, 
                          function(x){
                            tmp_dat <- SubsetData(sobj, cells.use = x)
                            markers <- FindMarkers(tmp_dat, 
                                                   "Megakaryocytes",
                                                   only.pos = T)
                            markers
                          })


og_rs_mks <- og_sobj@meta.data[og_sobj@meta.data$resampled, ]

sampled_mks <- map(not_re_mk_sampled, 
                   ~c(rownames(.x), 
                      rownames(og_rs_mks)))

all_cells_minus_not_sampled_mks <- map(sampled_mks,
                                       ~c(.x, rownames(not_mks)))

subsampled_markers_og <- map(all_cells_minus_not_sampled_mks,
                          function(x){
                            tmp_dat <- SubsetData(og_sobj, cells.use = x)
                            markers <- FindMarkers(tmp_dat, 
                                                   "Megakaryocytes",
                                                   only_pos = T)
                            markers
                          })
subsampled_markers <- map(subsampled_markers, 
      ~tibble::rownames_to_column(.x, "gene")) 
names(subsampled_markers) <- n_mks_to_test
subsampled_markers <- bind_rows(subsampled_markers, .id = "n_mks")
write_tsv(subsampled_markers,
          "downsampled_mk_cluster_markers_resampling.txt")

subsampled_markers_og <- map(subsampled_markers_og, 
      ~tibble::rownames_to_column(.x, "gene")) 

names(subsampled_markers_og) <- n_mks_to_test
subsampled_markers_og <- bind_rows(subsampled_markers_og, .id = "n_mks")
write_tsv(subsampled_markers_og,
          "downsampled_mk_cluster_markers_no_resampling.txt")
subsampled_markers_og <- read_tsv("downsampled_mk_cluster_markers_no_resampling.txt") 

subsampled_markers <- read_tsv("downsampled_mk_cluster_markers_resampling.txt")

og_summary <- subsampled_markers_og %>% 
  filter(p_val_adj < 0.01) %>% 
  count(n_mks)

rs_summary <- subsampled_markers %>% 
    filter(p_val_adj < 0.01) %>% 
  count(n_mks)

summary_dat <- bind_rows(list("Original Library" = og_summary,
                              "Resampled Library" = rs_summary),
          .id = "expt")
summary_dat <- mutate(summary_dat,
                      n_mks = as.integer(n_mks) + 4) ## to account for resampled cells
plt <- ggplot(summary_dat, 
       aes(n_mks, n)) +
  geom_bar(aes(fill = expt), 
           stat = "identity", position = "dodge") +
  scale_fill_brewer(palette = "Set1", name = "") +
  labs(x = "# of Megakaryocytes in Cluster",
       y = "# of Megakaryocyte\nmarkers discovered") +
  theme(legend.pos = "top") +
  guides(fill = guide_legend(nrow = 2))

save_plot("resampling_marker_discovery_sensitivity.pdf", 
          plt, 
          base_height = 4,
          base_aspect_ratio = 1)
mks <- sobj@meta.data[sobj@meta.data$cell_labels == "Megakaryocytes", ]
rs_mks <- mks[mks$resampled == "original cell", ]
not_mks <- sobj@meta.data[sobj@meta.data$cell_labels != "Megakaryocytes", ]
only4mks <- c(rownames(rs_mks),
  rownames(not_mks))
sub_obj <- SubsetData(og_sobj, cells.use = only4mks)

sub_obj <- RunPCA(sub_obj, pc.genes = rownames(sub_obj@data), 
               pcs.compute = 20, 
               do.print = F, seed.use = 20180525)
sub_obj <- RunTSNE(sub_obj, dims.use = 1:15, seed.use = 20180525)
sub_obj <- FindClusters(sub_obj,
                     dims.use = 1:15, 
                     k.param = 15,
                     resolution = 1.2, 
                     print.output = F, 
                     force.recalc = T,
                     random.seed = 20180525)

plot_feature(sub_obj, gene = "GNG11", pt.alpha = 1)

TSNEPlot(sub_obj, group.by = "resampled")

TSNEPlot(sub_obj, group.by = "res.1.2")